Brian M. Schilder, Bioinformatician II
# Import functions
root = "./"
source(file.path(root,"general_functions.R"))
import_parameters(params)## **** __Utilized Cores__ **** = 2$subsetGenes
## [1] "protein_coding"
##
## $subsetCells
## [1] 500
##
## $resolution
## [1] 0.6
##
## $resultsPath
## [1] "./Results"
##
## $nCores
## [1] 2
##
## $perplexity
## [1] 30
# load(file.path(resultsPath, "scRNAseq_results.RData"))
resultsPath <- "Results/subsetGenes-protein_coding__subsetCells-F__Resolution-0.2__perplexity-40__nCores-4"
load(file.path(resultsPath,"scRNAseq_results.RData"))library(Seurat)
library(cowplot)
library(ggplot2)
library(dplyr)
library(data.table)
library(readxl)
library(reshape2)
library(GeneOverlap) # BiocManager::install("GeneOverlap")
library(monocle) # BiocManager::install("monocle")
# BiocManager::install(c('DelayedArray', 'DelayedMatrixStats', 'org.Hs.eg.db', 'org.Mm.eg.db'))
library(org.Hs.eg.db)
library(garnett) # devtools::install_github("cole-trapnell-lab/garnett")
library(enrichR) #BiocManager::install("enrichR")# Convert Seurat objt to CDS object
mDAT <- monocle::importCDS(DAT, import_all = T)
# generate size factors for normalization later
mDAT <- DESeq2::estimateSizeFactors(mDAT)
# Get pre-trained PBMC classifer
load(file.path(root, "Data/Garnett/hsPBMC")) # Download from: https://cole-trapnell-lab.github.io/garnett/classifiers/hsPBMC
mDAT <- garnett::classify_cells(mDAT, hsPBMC,
db = org.Hs.eg.db,
cluster_extend = TRUE,
cds_gene_id_type = "SYMBOL")## Warning in doTryCatch(return(expr), name, parentenv, handler): The
## following genes used in the classifier are not present in the input CDS.
## Interpret with caution. ENSG00000174059The following genes used in the
## classifier are not present in the input CDS. Interpret with caution.
## ENSG00000007062The following genes used in the classifier are not present
## in the input CDS. Interpret with caution. ENSG00000157404The following
## genes used in the classifier are not present in the input CDS. Interpret
## with caution. ENSG00000185291
table(pData(mDAT)$cell_type)##
## B cells CD34+ CD4 T cells CD8 T cells
## 13 92 3 1
## Dendritic cells Monocytes NK cells T cells
## 280 15025 801 16
## Unknown
## 2913
table(pData(mDAT)$cluster_ext_type) ##
## B cells CD34+ CD4 T cells Dendritic cells
## 12 17 3 165
## Monocytes NK cells T cells Unknown
## 17165 701 15 1066
# Get feature genes for each cell type
feature_genes <- garnett::get_feature_genes(classifier = hsPBMC,
node = "root",
db = org.Hs.eg.db,
convert_ids = F)
head(feature_genes) ## B cells CD34+ Dendritic cells Monocytes
## (Intercept) -0.884989922 0.47694427 -0.124629734 -0.0301907756
## ENSG00000196154 -0.005499815 -0.01088859 -0.007978714 0.0056365451
## ENSG00000019582 0.030146634 -0.01959215 0.015232191 0.0001883507
## ENSG00000156738 2.193353690 -0.66191294 -1.104899464 -0.7393680112
## ENSG00000177455 2.056641938 -0.71851810 -1.941859710 -1.0448682467
## ENSG00000105369 1.800296145 -0.53835614 0.063380804 -0.1973728377
## NK cells T cells Unknown
## (Intercept) -0.911657164 0.524757370 0.949765956
## ENSG00000196154 -0.001627233 -0.005835858 0.026193667
## ENSG00000019582 -0.005880096 -0.023473024 0.003378099
## ENSG00000156738 -0.096566144 -0.602417257 1.011810124
## ENSG00000177455 0.297279508 0.177790907 1.173533704
## ENSG00000105369 -0.618567693 -0.963141845 0.453761563
# Convert back to Seurat object & save results
DAT <- Seurat::AddMetaData(DAT, pData(mDAT)[c("garnett_cluster","cell_type","cluster_ext_type")])
write.csv(DAT@meta.data, file.path(resultsPath, "garnett_results.csv"), quote = F,row.names = F)markerList <- c("CD14", "FCGR3A")
markerData <- DAT@scale.data[row.names(DAT@scale.data) %in% markerList,] %>% t() %>% data.frame()
## Efficiently merge using data.table
dt1 <- data.table(markerData, keep.rownames = "Cell", key = "Cell") %>% copy()
dt2 <- data.table(DAT@meta.data[c("cell_type","post_clustering")], keep.rownames = "Cell", key = "Cell") %>% copy()
markerDT <- dt1[dt2]
ggplot(data = markerDT, aes(x=CD14, y=FCGR3A) ) +
stat_density_2d(aes(fill = ..level..), geom = "polygon", colour="purple", bins = 100, size=.1) +
geom_point(aes(color=as.factor(cell_type)), shape=16, stroke=0, size=2, alpha=.1) +
guides(colour = guide_legend(title="cell_type",override.aes = list(alpha = 1))) tSNE_metadata_plot <- function(var, labSize=12){
# Metadata plot
p1 <- Seurat::TSNEPlot(DAT, do.return = T, do.label = T, group.by = var,label.size = labSize,
plot.title=paste(var), vector.friendly=T) +
theme(legend.position = "top") +
scale_color_brewer( palette = "Dark2", aesthetics = aes(alpha=.5))
# t-SNE clusters plot
p2 <- Seurat::TSNEPlot(DAT, do.return = T, do.label = T,label.size = labSize,
plot.title=paste("Unsupervised Clusters"), vector.friendly=T) +
theme(legend.position = "top") +
scale_color_brewer( palette = "Set1", aesthetics = aes(alpha=.5))
print(cowplot::plot_grid(p1,p2))
} tSNE_metadata_plot("cell_type")tSNE_metadata_plot("cluster_ext_type")tSNE_metadata_plot("mut")FeaturePlot(DAT,features.plot = c("CD14","FCGR3A", "LRRK2", "GBA"),
cols.use = c("grey","purple","magenta"), dark.theme = T, nCol = 2 )FeatureHeatmap(DAT, c("CD14","FCGR3A","LRRK2", "GBA"), pt.size = 0.5,
cols.use = c("grey","purple"), plot.horiz =T) Q: Is AD-related gene expression more prevalent in classical, intermediate or non-classical monocyte subtypes?
ADgenes <- read_excel(file.path(root, "Data/AD_genes.xlsx"))
MonocyteSubtype.markers <- read.csv(file.path(root, resultsPath, "MonocyteSubtype.markers.csv"), row.names = 1)
MonocyteSubtype.markers_sig <- subset(MonocyteSubtype.markers, p_val_adj <= 0.05)genomeSize <- dim(DAT@raw.data)[1]
list1 <- row.names(MonocyteSubtype.markers_sig)
list2 <- ADgenes$Gene
go.obj <- newGeneOverlap(listA = list1, listB = list2, genome.size = genomeSize )
go.obj <- testGeneOverlap(go.obj)
print(go.obj)## Detailed information about this GeneOverlap object:
## listA size=205, e.g. FCGR3A RHOC CDKN1C
## listB size=21, e.g. SPI1 CD33 PILRA
## Intersection size=9, e.g. IFITM3 IFITM2 IFITM1
## Union size=217, e.g. FCGR3A RHOC CDKN1C
## Genome size=24914
## # Contingency Table:
## notA inA
## notB 24697 196
## inB 12 9
## Overlapping p-value=3.9e-14
## Odds ratio=94.4
## Overlap tested using Fisher's exact test (alternative=greater)
## Jaccard Index=0.0
overlappingGenes <- getIntersection(go.obj)
percent_of_ADgenes <- length(overlappingGenes) / length(ADgenes$Gene)*100
percent_of_DEGs <- length(overlappingGenes) / length(row.names(MonocyteSubtype.markers_sig))*100
AD_DEGs <- subset(ADgenes, Gene %in% overlappingGenes)
cat("\n",round(percent_of_ADgenes, 2),"% of the AD-risk genes (",length(overlappingGenes),"/",length(ADgenes$Gene),") are differentially expressed between Canonical vs. Intermediate monocyte subtypes.")##
## 42.86 % of the AD-risk genes ( 9 / 21 ) are differentially expressed between Canonical vs. Intermediate monocyte subtypes.
cat("\n",round(percent_of_DEGs, 2),"% of the DEGs between Canonical vs. Intermediate monocyte subtypes (",
length(overlappingGenes),"/",length(row.names(MonocyteSubtype.markers_sig)),") are in the AD-risk gene list.")##
## 4.39 % of the DEGs between Canonical vs. Intermediate monocyte subtypes ( 9 / 205 ) are in the AD-risk gene list.
# percent_cells_expressing <- function(DAT, ADgenes, clusterList){
# for(clust in clusterList){
# cat("\nProcessing cluster",clust)
# clustDAT <- SubsetData(DAT, subset.name = "post_clustering", accept.value = clust)
# # Seurat:::PercentAbove(x = DAT@data, threshold = 0) # Very inefficient
# subDAT <- clustDAT@data[row.names(clustDAT@data ) %in% ADgenes$Gene,]
# for(gene in row.names(subDAT)){
# cat(".")
# ADgenes[ADgenes$Gene==gene,paste("cluster",clust,sep="")] <- mean(ifelse(subDAT[gene,] > 0, 1, 0))
# }
# }
# ADgenes <- melt(ADgenes, id.vars = c("Gene","Category"), measure.vars = paste("cluster",clusterList,sep=""),
# variable.name = "Cluster",
# value.name = "cellsExpressing")
# ggplot(data=AD_DEGs, aes(x=Cluster, y=cellsExpressing, fill=Gene)) + geom_col(position="dodge") +
# labs(title = "Fraction of Cells Expressing AD-associated Genes", y="Fraction of Cells") +
# scale_x_discrete(labels=c("Cluster0\n(Canonical)", "Cluster1\n(Intermediate)"))
# return(ADgenes)
# }
MonocyteSubtype.markers_sig$Gene <- row.names(MonocyteSubtype.markers_sig)
AD_pct <- melt(subset(MonocyteSubtype.markers_sig, Gene%in%ADgenes$Gene),
id.vars = c("Gene","avg_logFC"), measure.vars = c("pct.2", "pct.1"),
variable.name = "Group", value.name = "pct")
# Fraction Cells
ggplot(data=AD_pct, aes(x=Group, y=pct, fill=Gene)) + geom_col(position="dodge") +
labs(title = "Fraction of Cells Expressing AD-associated Genes", y="Fraction of Cells", x="Monocyte Subtype") +
scale_x_discrete(labels=c("Cluster0\n(Canonical)", "Cluster1\n(Intermediate)")) # LogFC
ggplot(data=AD_pct, aes(x=Gene, y=avg_logFC, fill=Gene)) + geom_col(position="dodge") +
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none") +
labs(title = "logFC of AD-associated DEGs:\nCanonical vs. Intermediate Monocytes", y="avg_logFC", x="Monocyte Subtype") clustDAT <- SubsetData(DAT, subset.name = "post_clustering", accept.value = c(0,1), do.scale = F)
markerDF <- get_markerDT(clustDAT, markerList = ADgenes$Gene, rawData = T)
markerMatrix <- reshape2::acast(markerDF, Gene~post_clustering, value.var="Expression",
fun.aggregate = mean, drop = F, fill = 0)
# Heatmap.2
my_palette <- colorRampPalette(c("purple", "black", "cyan"))(n = 1000)
gplots::heatmap.2(markerMatrix, xlab = "Cluster", dendrogram = "row",
col = my_palette, tracecol = "gray", srtCol = 0, offsetCol=1.5, vline=T,
trace='none', key.title=NA, key.ylab = "Expression", colsep=T, sepwidth = 0.01) enrichr_dbs <- c("KEGG_2018", "Reactome_2016",
"GO_Biological_Process_2018", "GO_Molecular_Function_2018", "GO_Cellular_Component_2018",
"Rare_Diseases_AutoRIF_ARCHS4_Predictions", "Human_Gene_Atlas")
# createDT(enrichR::listEnrichrDbs(), "Enrichr Databases")
geneList <- data.frame(Gene=row.names(MonocyteSubtype.markers),
Weight=scales::rescale(length(MonocyteSubtype.markers$p_val_adj):1))
results <- enrichr(genes = geneList, databases = enrichr_dbs ) Uploading data to Enrichr… Done. Querying KEGG_2018… Done. Querying Reactome_2016… Done. Querying GO_Biological_Process_2018… Done. Querying GO_Molecular_Function_2018… Done. Querying GO_Cellular_Component_2018… Done. Querying Rare_Diseases_AutoRIF_ARCHS4_Predictions… Done. Querying Human_Gene_Atlas… Done. Parsing results… Done.
for (db in enrichr_dbs){
cat('\n')
cat("### ",db,"\n")
# res <- subset(results[[db]], Adjusted.P.value<=0.05)
createDT_html(results[[db]], paste("Enrichr Results:",db))
cat('\n')
}